### This R script is to make Fig 3. The code also produces Appendix Figures C1 and C2.

# open necessary packages
library(tidyverse)
library(dotwhisker)
library(fixest)

# set your directory:
setwd("/Users/olgabaker/Desktop/Replication Package Summer 2025/Data")
data_19 <- read.csv("core_19.csv")
data_23 <- read.csv("core_23.csv")
data <- list(data_19,data_23) 
names(data) <- paste("data_",c(19,23),sep="")
rm(data_19); rm(data_23) # drop individual data objects

## Make names of teacher variables consistent across years. Rename variables if their variable name changed in a given 
# year. Change it to match the name in the "variable name" column of "Teacher Variables.xlsx"
data$data_19 <- data$data_19 %>% rename(cdis1=cdis)

## import list of teacher variables
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.name
varnames <- varnames[!is.na(varnames)] 

# add simple_avg to varnames
varnames <- c(varnames,"simple_avg")

### 1) Split the data using the above/below median of the following in 2019: -------------------------------------
### a) IPR_130
### b) Black
### c) Median household income

### a) IPR_130
median_ipr_130 <- median(data$data_19$ipr_130, na.rm=T)
data$data_19$above_median_ipr_130 <- ifelse(data$data_19$ipr_130>=median_ipr_130, 1, 0) 
count(data$data_19,above_median_ipr_130) # checking if the sample is split in half - 0 schools missing indicator in 2019

# Merge the dummy into the 2023 data to classify schools as “above median ipr_130” or “below median ipr_130”:
dummy <- select(data$data_19,ncessch,above_median_ipr_130)
data$data_23 <- left_join(data$data_23, dummy)
summary(data$data_23) # 170 schools missing indicator in 2023

### b) Black
median_black <- median(data$data_19$Black, na.rm=T)
data$data_19$above_median_black <- ifelse(data$data_19$Black>=median_black, 1, 0)
count(data$data_19,above_median_black) # 13 schools missing indicator in 2019

# Merge this dummy into the 2023 data to classify schools as “above median Black” or “below median Black”:
dummy <- select(data$data_19,ncessch,above_median_black)
data$data_23 <- left_join(data$data_23, dummy)
summary(data$data_23) # 179 schools missing indicator in 2023

### c) Median Household Income (abbreviate as income)
median_income <- median(data$data_19$median_income, na.rm=T)
data$data_19$above_median_income <- ifelse(data$data_19$median_income>=median_income, 1, 0) 
count(data$data_19,above_median_income) # 145 schools missing indicator in 2019

# Merge this dummy into the 2023 data to classify schools as “above median income” or “below median income”:
dummy <- select(data$data_19,ncessch,above_median_income)
data$data_23 <- left_join(data$data_23, dummy)
summary(data$data_23) # 312 schools missing indicator in 2023

### 2) Get 2019-2023 differences and statistical significance for sample split by IPR_130 ------------------------
## Label the weights you'll use for each year as "weight"
data$data_19 <- mutate(data$data_19, weight=teacher_n)
data$data_23 <- mutate(data$data_23, weight=teacher_n)

## Split data:
High_SES <- lapply(data, function(x) filter(x, above_median_ipr_130==0))
Low_SES <- lapply(data, function(x) filter(x, above_median_ipr_130==1))

## Next write a function that calculates weighted average
weighted_avg <- function(x,var){ # be sure to type the variable name with "" surrounding it
  if(!is.null(x[[var]])){
    return(weighted.mean(x[[var]],x[["weight"]], na.rm=T))
  } else{return(NA)}}

### a) High_SES 
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(High_SES, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from 2019-2023
Table <- matrix(NA, length(varnames), 6)
Table <- as.data.frame(Table)
colnames(Table) <- c("High_SES_ipr_130", "Low_SES_ipr_130", "High_SES_Black", "Low_SES_Black", "High_SES_income", "Low_SES_income") 
rownames(Table) <- rownames(weighted_scores)

Table$High_SES_ipr_130 <- weighted_scores$data_23 - weighted_scores$data_19

### b) Low_SES 
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(Low_SES, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from 2019-2023
Table$Low_SES_ipr_130 <- weighted_scores$data_23 - weighted_scores$data_19

## c) statistical significance

# first, make 2023 and low-SES dummies
data$data_19$year_23 <- 0
data$data_23$year_23 <- 1
data$data_19$low_ses <- ifelse(data$data_19$above_median_ipr_130==1,1,0)
data$data_23$low_ses <- ifelse(data$data_23$above_median_ipr_130==1,1,0)

# write function to run statistical significance test:
# Score(it) = a0 + b1*2023(t) + b2*Low-SES(i) + b3*(2023(t)*Low-SES(i)) + u(it)
# weight by "weight" and cluster by district

# data_1 is the earlier year, data_2 is the later year
high_low_statsig <- function(data_1, data_2, var){ # be sure to type variable name with ""
  if(is.null(data_1[[var]]) | is.null(data_2[[var]])){ # clause that gives an NA if the variable is not available in one of the years
    output <- c(NA,NA); names(output) <- c("coefficient","p-value")}
  else{
    stack_1 <- data.frame(construct=data_1[[var]], weight=data_1[["weight"]], year_23=data_1[["year_23"]], 
                          low_ses=data_1[["low_ses"]],leaid=data_1[["leaid"]])
    stack_2 <- data.frame(construct=data_2[[var]], weight=data_2[["weight"]], year_23=data_2[["year_23"]], 
                          low_ses=data_2[["low_ses"]],leaid=data_2[["leaid"]])
    stack <- rbind(stack_1, stack_2)
    
    regression <- feols(construct ~ year_23*low_ses, cluster=~leaid, data=stack, weights=~weight)
    coeff_B3 <- coef(regression)["year_23:low_ses"]
    p_val_B3 <- pvalue(regression)["year_23:low_ses"]
    output <- c(coeff_B3,p_val_B3); names(output) <- c("coefficient","p-value")}
  return(output)
}

# check work
high_low_statsig(data$data_19,data$data_23,"prti") 

Table["prti","Low_SES_ipr_130"]-Table["prti","High_SES_ipr_130"]-
  high_low_statsig(data$data_19,data$data_23,"prti")["coefficient"] # difference is very small

### get p-vals for each variable
Table_p_vals <- matrix(NA, nrow(Table), ncol(Table)/2)
Table_p_vals <- as.data.frame(Table_p_vals)
colnames(Table_p_vals) <- c("ipr_130", "Black", "income")
rownames(Table_p_vals) <- rownames(Table)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_p_vals$ipr_130[i] <- high_low_statsig(data$data_19,data$data_23, var)["p-value"]
}

## d) Drop the low-ses dummies
data$data_19$low_ses <- NULL
data$data_23$low_ses <- NULL

## e) get standard errors for the difference across 2019-2023 
## write function to run statistical significance test, which is clustered by district:
# data_1 is the earlier year, data_2 is the later year
wtd.reg <- function(data_1, data_2, var){ # be sure to type variable name with ""
  if(is.null(data_1[[var]]) | is.null(data_2[[var]])){ # clause that gives an NA if the variable is not available in one of the years
    output <- c(NA,NA); names(output) <- c("coefficient","std error")}
  else{
    stack_1 <- data.frame(construct=data_1[[var]], weight=data_1[["weight"]],leaid=data_1[["leaid"]])
    stack_2 <- data.frame(construct=data_2[[var]], weight=data_2[["weight"]],leaid=data_2[["leaid"]])
    stack_1$later_year_dummy <- 0
    stack_2$later_year_dummy <- 1
    stack <- rbind(stack_1, stack_2)
    
    regression <- feols(construct ~ later_year_dummy,  cluster=~leaid, data=stack, weights=~weight)
    coeff_B1 <- coef(regression)["later_year_dummy"]
    std_error_B1 <- se(regression)["later_year_dummy"]
    output <- c(coeff_B1,std_error_B1); names(output) <- c("coefficient","std error")}
  return(output)
}

# check work
test <- wtd.reg(High_SES$data_19, High_SES$data_23, "colb")
diff_in_means <- Table$High_SES_ipr_130[which(rownames(Table)=="colb")]
test[1] - diff_in_means # should be very small amount

### get std error for each variable and difference - fill in std errors by column 
Table_std_errors <- matrix(NA, nrow(Table), ncol(Table))
Table_std_errors <- as.data.frame(Table_std_errors)
colnames(Table_std_errors) <- colnames(Table)
rownames(Table_std_errors) <- rownames(Table)

# High_SES
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$High_SES_ipr_130[i] <- wtd.reg(High_SES$data_19, High_SES$data_23, var)[2]
}

# Low_SES
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$Low_SES_ipr_130[i] <- wtd.reg(Low_SES$data_19, Low_SES$data_23, var)[2]
}

### 3) Get 2019-2023 differences and statistical significance for sample split by Black ------------------------
## Label the weights you'll use for each year in Table 5 as "weight"
## Split data:
High_SES <- lapply(data, function(x) filter(x, above_median_black==0))
Low_SES <- lapply(data, function(x) filter(x, above_median_black==1))

### a) High_SES 
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(High_SES, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from 2019-2023
Table$High_SES_Black <- weighted_scores$data_23 - weighted_scores$data_19

### b) Low_SES 
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(Low_SES, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from 2019-2023
Table$Low_SES_Black <- weighted_scores$data_23 - weighted_scores$data_19

## c) statistical significance

# first, make low-SES dummies
data$data_19$low_ses <- ifelse(data$data_19$above_median_black==1,1,0)
data$data_23$low_ses <- ifelse(data$data_23$above_median_black==1,1,0)

### get p-vals for each variable
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_p_vals$Black[i] <- high_low_statsig(data$data_19,data$data_23, var)["p-value"]
}

(Table["prti","Low_SES_Black"]-Table["prti","High_SES_Black"])-high_low_statsig(data$data_19,data$data_23, "prti")["coefficient"]

## d) Drop the low-ses dummies
data$data_19$low_ses <- NULL
data$data_23$low_ses <- NULL

## e) get standard errors for the difference across 2019-2023
# High_SES
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$High_SES_Black[i] <- wtd.reg(High_SES$data_19, High_SES$data_23, var)[2]
}

# Low_SES
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$Low_SES_Black[i] <- wtd.reg(Low_SES$data_19, Low_SES$data_23, var)[2]
}

### 4) Get 2019-2023 differences and statistical significance for sample split by income ------------------------
## Label the weights you'll use for each year in Table 5 as "weight"
## Split data:
High_SES <- lapply(data, function(x) filter(x, above_median_income==1)) # NOTE THAT ORDER SWITCHED - High_SES group 
# is above_median_income==1 because larger values of income mean higher socioeconomic status
Low_SES <- lapply(data, function(x) filter(x, above_median_income==0))

### a) High_SES 
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(High_SES, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from 2019-2023
Table$High_SES_income <- weighted_scores$data_23 - weighted_scores$data_19

### b) Low_SES 
# put the calculated percentages in a table
weighted_scores <- matrix(NA, length(varnames), 2)
row.names(weighted_scores) <- varnames
colnames(weighted_scores) <- names(data)

for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  weighted_scores[i,] <- sapply(Low_SES, function(x) weighted_avg(x, var))
}
weighted_scores <- as.data.frame(weighted_scores)

## Take difference from 2019-2023
Table$Low_SES_income <- weighted_scores$data_23 - weighted_scores$data_19

## c) statistical significance

# first, make low-SES dummies
# NOTE THAT ORDER SWITCHED - Low_SES group is above_median_income==0 because lower values of income mean lower socioeconomic status
data$data_19$low_ses <- ifelse(data$data_19$above_median_income==0,1,0)
data$data_23$low_ses <- ifelse(data$data_23$above_median_income==0,1,0)

### get p-vals for each variable
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_p_vals$income[i] <- high_low_statsig(data$data_19,data$data_23, var)["p-value"]
}

(Table["prti","Low_SES_income"]-Table["prti","High_SES_income"])-high_low_statsig(data$data_19,data$data_23, "prti")["coefficient"]

## d) Drop the low-ses dummies
data$data_19$low_ses <- NULL
data$data_23$low_ses <- NULL

## e) get standard errors for the difference across 2019-2023
# High_SES
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$High_SES_income[i] <- wtd.reg(High_SES$data_19, High_SES$data_23, var)[2]
}

# Low_SES
for(i in 1:length(varnames)){
  var <- paste(varnames[i])
  Table_std_errors$Low_SES_income[i] <- wtd.reg(Low_SES$data_19, Low_SES$data_23, var)[2]
}

### 5) Check work on statistical significance tests and calculate statistical significance at 5% level:
Table_p_vals>1
Table_p_vals<0
stat_sig <- Table_p_vals<=.05
stat_sig <- as.data.frame(stat_sig)

### 6) Combine differences and statistical significance into single table:

### scale means and std errors by 20 and drop 'data' variable
Table <- Table/20
Table_std_errors <- Table_std_errors/20

Table <- Table[-which(rownames(Table)=='data'),]
Table_std_errors <- Table_std_errors[-which(rownames(Table_std_errors)=='data'),]
stat_sig <- stat_sig[-which(rownames(stat_sig)=='data'),]

# make a dataframe with asterisks where result is statistically significant:
asterisk <- matrix(NA, nrow(stat_sig), ncol(stat_sig))
asterisk <- as.data.frame(asterisk)
colnames(asterisk) <- colnames(stat_sig)
rownames(asterisk) <- rownames(stat_sig)

for(j in 1:ncol(stat_sig)){
  for(i in 1:nrow(stat_sig)){
    if(is.na(stat_sig[i,j])){asterisk[i,j] <- ""}
    else if(stat_sig[i,j]){asterisk[i,j] <- "*"}
    else{asterisk[i,j] <- ""}
  }
}
asterisk; stat_sig

Table_final <- matrix(NA, nrow(Table), ncol(Table))
Table_final <- as.data.frame(Table_final)
colnames(Table_final) <- colnames(Table)
rownames(Table_final) <- rownames(Table)

# round differences and combine with asterisks:
Table_rounded <- round(Table,2)

Table_final[,1] <- sprintf("%s%s", Table_rounded[,1], asterisk[,1])
Table_final[,2] <- sprintf("%s%s", Table_rounded[,2], asterisk[,1])
Table_final[,3] <- sprintf("%s%s", Table_rounded[,3], asterisk[,2])
Table_final[,4] <- sprintf("%s%s", Table_rounded[,4], asterisk[,2])
Table_final[,5] <- sprintf("%s%s", Table_rounded[,5], asterisk[,3])
Table_final[,6] <- sprintf("%s%s", Table_rounded[,6], asterisk[,3])

Table_final

###7) Make dot and whisker plots - export with size 740x575
# need columns 'term', 'estimate', 'std.error' and 'model'
varnames <- read.csv("Teacher Variables.csv", na.strings = c(""))
varnames <- varnames$Variable.definition
varnames <- varnames[!is.na(varnames)]
varnames <- varnames[-which(varnames=="Collective Use of Assessment Data")]
varnames <- c(varnames, "Simple Average")

#### IPR(130) 
plot.data.High_SES_ipr_130 <- data.frame(term=varnames,estimate=Table$High_SES_ipr_130,
                                         std.error=Table_std_errors$High_SES_ipr_130,model="High SES")
plot.data.Low_SES_ipr_130 <- data.frame(term=varnames,estimate=Table$Low_SES_ipr_130,
                                        std.error=Table_std_errors$Low_SES_ipr_130,model="Low SES")

plot.data <- rbind(plot.data.High_SES_ipr_130,plot.data.Low_SES_ipr_130)
plot.data$color <- ifelse(plot.data$estimate>=0,"positive","negative")

# add asterisks to variable names for statistical significance between groups
plot.data$term <- sprintf("%s %s", varnames, asterisk[,"ipr_130"])

dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  scale_color_manual(values=c("positive"="green4","negative"="red"))+
  facet_wrap(~factor(model, levels=c("High SES","Low SES")),
             ncol=2)+theme(legend.position="none") # >> This is Fig 3

#### Black
plot.data.High_SES_Black <- data.frame(term=varnames,estimate=Table$High_SES_Black,
                                         std.error=Table_std_errors$High_SES_Black,model="High SES")
plot.data.Low_SES_Black <- data.frame(term=varnames,estimate=Table$Low_SES_Black,
                                        std.error=Table_std_errors$Low_SES_Black,model="Low SES")

plot.data <- rbind(plot.data.High_SES_Black,plot.data.Low_SES_Black)
plot.data$color <- ifelse(plot.data$estimate>=0,"positive","negative")

# add asterisks to variable names for statistical significance between groups
plot.data$term <- sprintf("%s %s", varnames, asterisk[,"Black"])

dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  scale_color_manual(values=c("positive"="green4","negative"="red"))+
  facet_wrap(~factor(model, levels=c("High SES","Low SES")),
             ncol=2)+theme(legend.position="none") # >> This is Fig C1

# Median Household Income
plot.data.High_SES_income <- data.frame(term=varnames,estimate=Table$High_SES_income,
                                         std.error=Table_std_errors$High_SES_income,model="High SES")
plot.data.Low_SES_income <- data.frame(term=varnames,estimate=Table$Low_SES_income,
                                        std.error=Table_std_errors$Low_SES_income,model="Low SES")

plot.data <- rbind(plot.data.High_SES_income,plot.data.Low_SES_income)
plot.data$color <- ifelse(plot.data$estimate>=0,"positive","negative")

# add asterisks to variable names for statistical significance between groups
plot.data$term <- sprintf("%s %s", varnames, asterisk[,"income"])

dwplot(plot.data, dot_args = list(aes(color=color)), whisker_args = list(aes(color=color)),
       vline = geom_vline(xintercept = 0, colour = "grey50", linetype = 2)) +
  scale_color_manual(values=c("positive"="green4","negative"="red"))+
  facet_wrap(~factor(model, levels=c("High SES","Low SES")),
             ncol=2)+theme(legend.position="none") # >> This is Fig C2

